
### -------------------------------------------------- #
### ---- ASSIGN KEY VARS & SET UP ---- 
### -------------------------------------------------- #

### Clear global environment
### NOTE(review): when run at top level this removes every (non-hidden) object
### in the session that sources this script; confirm this is intended before
### sourcing it from an interactive session that holds other work.
rm(list=ls()) 

### Libraries:
### pacman's p_load() attaches all listed packages in one call. magrittr is
### listed explicitly because the functions below use its `%<>%` operator.
library(pacman)
p_load(tidyverse, tibble, magrittr, data.table)


### -------------------------------------------------- #
### ---- Function to Load and Combine Datasets: ----
### -------------------------------------------------- #

# Load one state's pair of match files and return them bound side by side.
#
# Args:
#   ABB_list: data frame with the columns `ABB`, `file_name_TFA` and
#             `file_name_VF` (one row per state).
#   i:        row of `ABB_list` to process.
#
# Returns:
#   A data frame with the renamed TFA columns followed by the renamed VF
#   columns (every column converted to character) plus `state_vf_matched`,
#   which holds the state abbreviation of row `i`.
#
# Stops with an error when a file does not define the expected object
# (`TFA_matches` / `VF_matches`) or when the two sides differ in row count.
comb_datasets_fun <- function(ABB_list, i){

  # Read a single named object out of an .RData file. Loading into a scratch
  # environment keeps any other objects stored in the file from shadowing the
  # locals of this function.
  load_object <- function(path, object_name) {
    holder <- new.env()
    load(path, envir = holder)
    if (!exists(object_name, envir = holder, inherits = FALSE)) {
      stop("File '", path, "' does not contain an object named '",
           object_name, "'.", call. = FALSE)
    }
    get(object_name, envir = holder, inherits = FALSE)
  }

  # `[[` + as.character works whether the columns are character or factor
  ABB <- as.character(ABB_list[["ABB"]][i])
  tfa_file <- as.character(ABB_list[["file_name_TFA"]][i])
  vf_file <- as.character(ABB_list[["file_name_VF"]][i])

  # Load TFA and VF sides of the match
  TFA_matches <- load_object(tfa_file, "TFA_matches")
  VF_matches <- load_object(vf_file, "VF_matches")

  # The two sides are combined by position further down, so they must line up
  # row for row. Fail early instead of letting cbind() error late or recycle
  # the shorter side.
  if (nrow(TFA_matches) != nrow(VF_matches)) {
    stop("TFA and VF match files for ", ABB, " have different row counts (",
         nrow(TFA_matches), " vs ", nrow(VF_matches), ").", call. = FALSE)
  }

  # Adjust TFA variable names
  TFA_matches <- TFA_matches %>%
    # Variables used in match get a "_tfa" suffix
    rename_at(vars(FirstName, LastName, Sex, BirthYear,
                   LandLine_AreaCode, LandLine_Number,
                   CellPhone_AreaCode, CellPhone_Number,
                   Current_Address.Zip, Permanent_Address.Zip),
              ~paste0(., "_tfa")) %>%
    # Remove state abbreviation from binary state vars
    rename_at(vars(starts_with("state_")),
              ~str_replace(., paste0("_", ABB), "")) %>%
    # Create new version with state abbreviation instead of 1
    # (stored temporarily as <name>_temp, then renamed to vf_match_<name>)
    mutate_at(vars(starts_with("state_")),
              .funs = list(temp = ~ifelse(. == 1, ABB, NA))) %>%
    rename_at(vars(ends_with("_temp")), ~paste0("vf_match_", .)) %>%
    rename_at(vars(ends_with("_temp")), ~str_replace(., "_temp", "")) %>%
    # Rename binary state vars to clarify binary nature ("flags")
    rename_at(vars(starts_with("state_")),
              ~paste0("vf_match_", ., "_flag")) %>%
    mutate(
      # Total connections to state (all sources)
      total_sources_all = vf_match_state_ca_flag + vf_match_state_uni_flag +
        vf_match_state_st_flag + vf_match_state_mr_flag +
        vf_match_state_cell_flag,
      # Total connections to state ('good' sources)
      # NOTE(review): this is currently the same sum as total_sources_all;
      # confirm whether a source was meant to be excluded here.
      total_sources_sub = vf_match_state_ca_flag + vf_match_state_uni_flag +
        vf_match_state_st_flag + vf_match_state_mr_flag +
        vf_match_state_cell_flag,
      # Total connections to state ('good' sources w/o mat region)
      total_sources_nomat = vf_match_state_ca_flag + vf_match_state_uni_flag +
        vf_match_state_st_flag + vf_match_state_cell_flag,
      # Total connections to state ('good' sources from application)
      total_sources_app = vf_match_state_ca_flag + vf_match_state_uni_flag +
        vf_match_state_cell_flag
    )

  # Drop the voter-file zip columns from the TFA side when they are present
  # (the VF side carries columns with these names, see the rename below)
  zip_cols <- intersect(c("RegistrationAddressZip5", "MailingAddressZip5"),
                        names(TFA_matches))
  if (length(zip_cols) > 0) {
    TFA_matches <- TFA_matches %>% select(-one_of(zip_cols))
  }

  # Adjust VF variable names (for vars used in match): add a "_vf" suffix
  VF_matches <- VF_matches %>%
    rename_at(vars(FirstName, LastName, Sex, BirthYear, BirthMonth, BirthDay,
                   LandLine_AreaCode, LandLine_Number,
                   CellPhone_AreaCode, CellPhone_Number,
                   MailingAddressZip5, RegistrationAddressZip5),
              ~paste0(., "_vf"))

  # Change everything to character
  TFA_matches <- TFA_matches %>% mutate_all(as.character)
  VF_matches <- VF_matches %>% mutate_all(as.character)

  # Combine TFA and VF sides of the match (row k of one side belongs with
  # row k of the other) and add a variable with the state matched
  cbind(TFA_matches, VF_matches) %>%
    mutate(state_vf_matched = ABB)
}

### -------------------------------------------------- #
### ---- Function to Combine Data: ----
### -------------------------------------------------- #

# Stack the per-state match files, clean the election-history columns, merge
# the applicant data back in, save the combined dataset and print two
# summaries of how many voter-file matches each applicant has.
#
# Args:
#   attempt_no: match attempt number. Only 1 is supported; any other value
#               stops with an error.
#
# Side effects:
#   Reads the per-state match files and the two applicant files under
#   "Temp_Data/", writes "Temp_Data/comb_matches_wDups_m<attempt_no>_v2.RData"
#   and prints progress messages and both summary tables.
#
# Returns:
#   Invisibly, the second summary table (applicants by number of matches after
#   restricting to rows with total_sources_app > 0).
fun_everything <- function(attempt_no=1){

  ### ---- ASSIGN KEY VARS & SET UP ----

  ### Only the first match attempt has a file naming scheme defined
  if(attempt_no != 1){
    stop("updated function only works for attempt no 1")
  }

  ### File Paths
  file_path <- ""
  match_file_path <- paste0(file_path,"Temp_Data/State_VF_Matches/")

  ### Get list of state abbreviations (50 states + DC)
  data(state)
  ABB <- c(state.abb, "DC")

  ### Generate TFA and VF file paths for each state
  files <- paste0(match_file_path,"Matched_",ABB,"_bday_parts_v2")
  ABB_list <- data.frame(ABB = ABB,
                         file_name_TFA = paste0(files,"_TFA.RData"),
                         file_name_VF = paste0(files,"_VF.RData"),
                         stringsAsFactors = FALSE)

  ### ---- Load and Combine Datasets: ----

  ### Load every state first, then stack once; growing the data frame with
  ### rbind() inside the loop would re-copy all earlier states each time
  state_pieces <- lapply(seq_len(nrow(ABB_list)), function(i){
    piece <- comb_datasets_fun(ABB_list=ABB_list, i)
    print(paste0("Finished adding data for ",ABB_list[i,"ABB"], "."))
    piece
  })
  comb_tfa_matches <- do.call(rbind, state_pieces)

  ### ---- Add/Clean Variables (Combined Data): ----

  ### Print progress message
  print("Starting cleaning and adding vars to combined dataset.")

  ### Standardize non-voter value as zero
  # Election variable names (any column whose name contains "VH")
  elec_vars <- names(comb_tfa_matches)[names(comb_tfa_matches) %like% "VH"]

  # Trim whitespace from election vars
  # NOTE(review): this selects columns *starting* with "VH", while elec_vars
  # above takes columns *containing* "VH"; confirm the two sets coincide.
  comb_tfa_matches %<>%
    mutate_at(vars(starts_with("VH")), trimws)
  print("Trimmed white space from election vars.")

  # Check that values of VH vars range from 1 to 10
  print("Verify that values of VH variables range from 1 to 10.")
  for(elec_var in elec_vars){
    print(levels(as.factor(comb_tfa_matches[[elec_var]])))
  }

  # Create vector with possible election var values
  elec_vals <- as.character(c(1:10))

  # Code missing values: anything that is not one of elec_vals (NA included)
  # becomes "0". The fill is a string so the column stays character even
  # when none of its values are valid.
  if(length(elec_vars) > 0){
    comb_tfa_matches %<>%
      mutate_at(elec_vars, ~ifelse(. %in% elec_vals, ., "0"))
  }
  print("Coded empty VH values as zero.")

  ### Add orig. TFA vars AND add non-matched applicants back into dataset
  # Load tfa_dat_match
  load(paste0(file_path,"Temp_Data","/tfa_to_state_ALL_v2.RData"))
  tfa_dat_match %<>% mutate_all(as.character)
  # Add flag for successful match to voter file (stays NA for applicants
  # that only enter through the left join below)
  comb_tfa_matches %<>% mutate(matched_to_vf = 1)
  # Merge into original tfa dataset (w/ state matches)
  comb_tfa_matches <- left_join(tfa_dat_match,
                                comb_tfa_matches,
                                by = "personid")
  print("Merged orig. TFA vars and unmatched applicants.")

  ### Add cleaned TFA vars
  # Load tfa_dat_flag
  load(file=paste0(file_path,"Temp_Data/tfa_to_state_ALL_flags_v2.RData"))
  # Subset to cleaned variables
  tfa_dat_flag %<>%
    select("personid", "FirstName", "LastName", "Sex", "new_degreedate",
           "DegYear", "DegMonth", "DegDay", "new_dateofbirth", "BirthYear",
           # "BirthMonth", "BirthDay", "BirthDate_full",
           "homephone_clean", "workphone_clean", "cellphone_clean",
           "homephone_area_code", "workphone_area_code",
           "cellphone_area_code", "homephone_number", "workphone_number",
           "cellphone_number", "Current_Address.Zip_old",
           "Permanent_Address.Zip_old", "LandLine_AreaCode",
           "LandLine_Number", "CellPhone_AreaCode", "CellPhone_Number") %>%
    mutate_all(as.character)
  # Merge
  comb_tfa_matches <- left_join(comb_tfa_matches,
                                tfa_dat_flag,
                                by = "personid")
  print("Merged additional cleaned TFA vars.")

  ### Print progress message
  print("Finished cleaning and adding vars to combined dataset.")

  ### ---- SAVE MATCHED DATASET ----

  ### Save combined dataset
  save(comb_tfa_matches,
       file=paste0(file_path,"Temp_Data","/comb_matches_wDups_m",attempt_no,"_v2.RData"))
  print("Saved combined data.")

  ### ---- Summarize: No. of Matches ----

  # Count applicants by the number of rows they have in `dat`; applicants
  # whose match flag sums to NA or 0 are reported as having 0 matches.
  count_applicants_by_matches <- function(dat){
    dat %>%
      # Summarize number of instances of applicant in combined dataset
      group_by(personid) %>%
      summarise(no_of_matches = n(),
                matched_to_vf_temp = sum(matched_to_vf)) %>%
      ungroup() %>%
      # Separate 0 matches from 1 match
      mutate(no_of_matches = ifelse(is.na(matched_to_vf_temp) |
                                      matched_to_vf_temp == 0,
                                    0, no_of_matches)) %>%
      # Summarize number of applicants by number of matches
      group_by(no_of_matches) %>%
      summarise(applicants = n())
  }

  ### Summarize: all sources included
  # Values are not auto-printed inside a function, so print explicitly
  print("Dups with all sources included.")
  print(count_applicants_by_matches(comb_tfa_matches))

  ### Remove non-application or bad sources
  print("Dups after subsetting to 'good' application sources.")
  app_source_summary <- comb_tfa_matches %>%
    # Filter by total 'good' application sources. The column is character at
    # this point, so convert it before the numeric comparison.
    filter(as.numeric(total_sources_app) > 0 & !is.na(total_sources_app)) %>%
    count_applicants_by_matches() %>%
    # Zero total is no longer accurate
    filter(no_of_matches != 0)
  print(app_source_summary)

  invisible(app_source_summary)
}

### -------------------------------------------------- #
### ---- Run Function to Combine Data: ----
### -------------------------------------------------- #

# Remove every non-function object from the workspace: ls() lists all
# objects, lsf.str() lists the functions, and setdiff() keeps the rest for
# removal. Then run the full pipeline for match attempt 1.
rm(list = setdiff(ls(), lsf.str()))
fun_everything(attempt_no=1)



